In [1]:
import os
import urllib
import urllib.parse  # `import urllib` alone does not import the `parse` submodule

import numpy as np
import pandas as pd
import geopandas as gpd
import shapely.wkt
In [2]:
# if running on Colab, uncomment and run this line below too:
# %pip install mapclassify  (use %pip rather than !pip so the install targets this kernel's environment)
In [3]:
# All file outputs land here; create it up front so later to_csv calls cannot fail
output_dir = "output/"
os.makedirs(output_dir, exist_ok=True)
In [4]:
# Functions

def get_all_organisations():
    """Fetch the organisation lookup from the planning.data.gov.uk datasette.

    Returns
    -------
    pandas.DataFrame
        One row per organisation: entity id, name, organisation code, dataset
        type, LAD / LPA codes, and a derived ``statistical_geography`` column
        (the LAD code for local authorities, the LPA code otherwise).
        Requires network access.
    """
    # Single quotes for the SQL string literal: in SQLite, double quotes are
    # identifier quotes and only fall back to a string literal when no column
    # matches (a deprecated misfeature) — the original relied on that fallback.
    # Also dropped the spurious f-prefix (the string has no placeholders).
    params = urllib.parse.urlencode({
        "sql": """
        select entity as organisation_entity, name, organisation, dataset, local_planning_authority, local_authority_district,
        case when dataset = 'local-authority' then local_authority_district else local_planning_authority end as statistical_geography
        from organisation
        """,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    df = pd.read_csv(url)
    return df


def get_pdp_geo_dataset(dataset, underscore_cols=True, crs_out=27700):
    """Download a planning.data.gov.uk dataset as a GeoDataFrame.

    Parameters
    ----------
    dataset : str
        Dataset slug, e.g. "local-planning-authority".
    underscore_cols : bool, default True
        Replace "-" with "_" in column names so they are valid identifiers.
    crs_out : int, default 27700
        EPSG code to project to (27700 = British National Grid, metre units).

    Returns
    -------
    geopandas.GeoDataFrame
        The dataset re-projected to ``crs_out``. Requires network access.
    """
    url = f"https://files.planning.data.gov.uk/dataset/{dataset}.geojson"
    gdf = gpd.read_file(url)

    if underscore_cols:
        gdf.columns = [x.replace("-", "_") for x in gdf.columns]

    # GeoJSON is WGS84 by definition, and read_file normally attaches that CRS
    # already — in which case an unconditional set_crs raises ValueError.
    # Only set it when it is genuinely missing.
    if gdf.crs is None:
        gdf.set_crs(epsg=4326, inplace=True)
    gdf.to_crs(epsg=crs_out, inplace=True)

    return gdf

Data in¶

In [5]:
# Get the organisation lookup (one row per organisation, with entity ids and
# LPA / LAD geography codes). Network call to the planning.data.gov.uk datasette.
org_df = get_all_organisations()
print(len(org_df))  # row count — compare with the rendered output below
452
In [6]:
# Read in the manual conservation-area count sheet (published Google Sheet, CSV export)
con_count_df = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSGZIudsGx0ez4cU-4wSvymvXIFfpDb_qfbS3uW5RiuBkJrJQ9D8k0HBUPtgncRXA/pub?gid=485605871&single=true&output=csv")
# normalise column names to snake_case so they are valid Python identifiers
con_count_df.columns = [x.replace("-", "_") for x in con_count_df.columns]

# Left-join organisation names and LPA codes onto the manual counts; rows with
# no matching organisation_entity keep NaN in the joined columns.
con_count_lpa_df = con_count_df.merge(
    org_df[["organisation_entity", "name", "local_planning_authority"]],
    how = "left",
    on = "organisation_entity"
)

print(len(con_count_lpa_df))  # left join on a key lookup: should equal len(con_count_df)
# con_count_lpa_df.head()
328
In [7]:
# Conservation areas from the platform (CSV flavour of the dataset, selected columns only)
ca_df = pd.read_csv("https://files.planning.data.gov.uk/dataset/conservation-area.csv",
                            usecols = ["entity", "name", "organisation-entity", "reference", "entry-date", "point"])

ca_df.columns = [x.replace("-", "_") for x in ca_df.columns]

# Parse the WKT "point" strings into shapely geometries and promote to a GeoDataFrame.
# NOTE(review): shapely.wkt.loads raises on NaN/malformed WKT — this assumes every
# row has a valid point; confirm against the source data.
ca_df["point"] = ca_df["point"].apply(shapely.wkt.loads)
ca_gdf = gpd.GeoDataFrame(ca_df, geometry='point')

# Transform to EPSG:27700 (British National Grid) for more interpretable area units
ca_gdf.set_crs(epsg=4326, inplace=True)
ca_gdf.to_crs(epsg=27700, inplace=True)
In [8]:
# Latest ONS LPA boundary file (April 2023), used below to flag whether each
# PDP LPA reference is a current 2023 boundary or a retired one.
ons_lpa_gpd = gpd.read_file("https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/Local_Planning_Authorities_April_2023_Boundaries_UK_BGC/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson",)

print(len(ons_lpa_gpd))
# ons_lpa_gpd.head()
379
In [9]:
# LPA boundaries from the PDP site (mix of current 2023 and retired boundaries)
lpa_gdf = get_pdp_geo_dataset("local-planning-authority")

# Flag LPAs whose code appears in the ONS April-2023 file. `.isin()` already
# returns a boolean Series, so routing it through np.where(cond, True, False)
# was redundant.
lpa_gdf["lpa_2023"] = lpa_gdf["reference"].isin(ons_lpa_gpd["LPA23CD"])
lpa_gdf.rename(columns={'name':'lpa_name'}, inplace=True)

print(len(lpa_gdf))
# lpa_gdf.head()
337

Analysis¶

Spatial joining¶

In [10]:
# Spatially join LPA polygons to conservation-area points (left join: every LPA
# kept, one row per intersecting CA), then merge on the name of each CA's
# supplying organisation.
lpa_ca_join = gpd.sjoin(
    lpa_gdf[["reference", "lpa_name", "lpa_2023", "geometry"]],
    ca_gdf[["entity", "organisation_entity", "point"]],
    how = "left",
    predicate = "intersects"
).merge(
    org_df[["organisation_entity", "name"]],
    how = "left",
    on = "organisation_entity"
)

# cast to str so missing names become "nan" and the ', '.join(set(...)) agg below cannot fail
lpa_ca_join["name"] = lpa_ca_join["name"].astype(str)

print(len(lpa_ca_join))
# lpa_ca_join.head()
9275
In [11]:
# Flag the providing org type, ranked so that when we group and take the min,
# an area supplied by both an LPA and Historic England counts as LPA-supplied.
# NOTE(review): organisation_entity 16 is assumed to be Historic England
# (hard-coded id) — confirm against the organisation lookup.
lpa_ca_join["org_type_rank"] = np.select(
    [
        # rank 1: any non-Historic-England supplier (i.e. an LPA)
        (lpa_ca_join["organisation_entity"] != 16) & (lpa_ca_join["organisation_entity"].notnull()),
        # rank 2: Historic England
        lpa_ca_join["organisation_entity"] == 16
    ],
    [1, 2],
    default = 3)  # rank 3: no supplier matched this LPA in the spatial join

# lpa_ca_join.head()
In [12]:
# Count conservation areas per LPA, then join on the manual counts
lpa_ca_join_count = lpa_ca_join.groupby(
        ["reference", "lpa_name", "lpa_2023"]
    ).agg(
        {"entity" : "count",  # number of CA points falling in the LPA
         "name" : lambda x: ', '.join(set(x)),  # distinct supplier names, comma-separated
         "org_type_rank" : "min"}  # best (lowest) provider rank — see ranking cell above
    ).reset_index(    
    ).merge(
        con_count_lpa_df[["local_planning_authority", "name", "conservation_area_count"]],
        how = "left",
        left_on = "reference",
        right_on = "local_planning_authority"
    )

# rename cols (the merge produced name_x from the platform side, name_y from the manual sheet)
lpa_ca_join_count.rename(columns=
                         {"entity":"count_platform", 
                          "name_x":"platform_data_providers", 
                          "conservation_area_count":"count_manual",
                          "name_y":"lpa_name_manual"}, inplace = True)

# relative delta: (platform - manual) / manual; NaN where there is no manual count
lpa_ca_join_count["count_delta"] = (lpa_ca_join_count["count_platform"] - lpa_ca_join_count["count_manual"]) / lpa_ca_join_count["count_manual"]
lpa_ca_join_count["count_delta_abs"] = abs(lpa_ca_join_count["count_delta"])
# use org type rank to flag the best provider for an area
lpa_ca_join_count["provider_org_type"] = lpa_ca_join_count["org_type_rank"].map({1:"LPA", 2:"Historic England", 3:"None"})

# write to csv
lpa_ca_join_count.to_csv(os.path.join(output_dir, "LPA_conservation_area_count_comparison.csv"), index = False)

lpa_ca_join_count.head()
Out[12]:
reference lpa_name lpa_2023 count_platform platform_data_providers org_type_rank local_planning_authority lpa_name_manual count_manual count_delta count_delta_abs provider_org_type
0 E60000001 County Durham LPA True 92 Historic England 2 E60000001 Durham County Council 93.0 -0.010753 0.010753 Historic England
1 E60000002 Darlington LPA True 17 Historic England 2 E60000002 Darlington Borough Council 17.0 0.000000 0.000000 Historic England
2 E60000003 Hartlepool LPA True 8 Historic England 2 E60000003 Hartlepool Borough Council 8.0 0.000000 0.000000 Historic England
3 E60000004 Middlesbrough LPA True 8 Historic England 2 E60000004 Middlesbrough Borough Council 8.0 0.000000 0.000000 Historic England
4 E60000005 Northumberland LPA True 68 Historic England 2 E60000005 Northumberland County Council 70.0 -0.028571 0.028571 Historic England

Get single LPA layers¶

Where we have manual CA counts from organisations that are now technically "retired" LPAs (i.e. replaced by a newer LPA), it indicates that the data is still divided up and supplied by these historic organisations. In these cases we don't want to show the new 2023 LPA on the map at the same time: it overlaps the historic boundaries, which is confusing, and we haven't actually collected data from the new organisation.

So we want to find the new 2023 LPAs which sit over retired LPAs that have supplied us with data, so we can remove them from the map and get a single contiguous layer which is a mix of historic and current LPA boundaries.

In [13]:
# Build a GeoDataFrame of the count comparison for all LPAs by joining the
# counts back onto the LPA geometries (left join keeps every LPA polygon).
lpa_ca_join_count_gdf = lpa_gdf[["reference", "geometry"]].merge(
    lpa_ca_join_count[["reference", "lpa_name", "lpa_2023", "provider_org_type", "platform_data_providers", "count_platform", "count_manual", "count_delta"]],
    how = "left",
    on = "reference"
)
In [14]:
# old lpas = those which are not a 2023 boundary and which we have a manual count for
old_lpas = lpa_ca_join_count_gdf[(lpa_ca_join_count_gdf["lpa_2023"] == False) & (lpa_ca_join_count_gdf["count_manual"].notnull())]

# Buffer the new 2023 LPA boundaries by 100 (metres — the layer is in EPSG:27700)
# so old boundaries that almost coincide with a new one still test as "within".
buffered_new_lpas = lpa_ca_join_count_gdf[(lpa_ca_join_count_gdf["lpa_2023"] == True)][["reference", "lpa_name", "geometry"]].copy()
buffered_new_lpas["geometry"] = buffered_new_lpas["geometry"].buffer(100)

# new 2023 lpas to flag are those which have an "old" lpa within them;
# after the sjoin the new LPA's code appears in the "reference_right" column
new_lpas = gpd.sjoin(
    old_lpas[["reference", "lpa_name", "lpa_2023", "geometry"]],
    buffered_new_lpas,
    how = "inner",
    predicate = "within"
)
In [15]:
# Create a flag on the LPA gdf for whether each LPA is included in the
# historic & new combined single layer.

old_lpas_incl_list = old_lpas["reference"].drop_duplicates().values
new_lpas_excl_list = new_lpas["reference_right"].drop_duplicates().values

# Show an LPA when it is either (a) an old LPA we hold manual counts for, or
# (b) a 2023 LPA that does not sit over any such old LPA. A plain boolean
# expression replaces the original np.select([...], [True, True], default=False),
# which was an obfuscated OR. NaN in lpa_2023 compares as not-True, so
# unmatched rows still default to False.
lpa_ca_join_count_gdf["old_lpa_combo_display"] = (
    lpa_ca_join_count_gdf["reference"].isin(old_lpas_incl_list)
    | (
        (lpa_ca_join_count_gdf["lpa_2023"] == True)
        & ~lpa_ca_join_count_gdf["reference"].isin(new_lpas_excl_list)
    )
)
In [16]:
# Bin the count comparison delta into 10%-wide bands, open-ended beyond ±50%
bins = [-np.inf, -0.5, -0.4, -0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3, 0.4, 0.5, np.inf]
labels = ["< -50%", "-50% : -40%", "-40% : -30%", "-30% : -20%", "-20% : -10%", "-10% : 0", "0 : +10%", "+10% : +20%", "+20% : +30%", "+30% : +40%", "+40% : +50%", "> +50% "]

# pd.cut yields an ordered categorical, so the map legend sorts bands correctly
lpa_ca_join_count_gdf["count_delta_bins"] = pd.cut(lpa_ca_join_count_gdf["count_delta"], bins = bins, labels = labels)
In [17]:
# Simplify geometry with a 100m tolerance for smaller maps (helps running in Colab)
lpa_ca_join_count_gdf["geometry"] = lpa_ca_join_count_gdf["geometry"].simplify(100)

Map¶

Conservation area national provision¶

In [21]:
def org_type_colormap(value):
    """Map a provider-organisation type to a hex fill colour for the map.

    "LPA" and "Historic England" get their branded colours; anything else
    (including "None") falls back to grey.
    """
    palette = {
        "LPA": "#28A197",
        "Historic England": "#12436D",
    }
    return palette.get(value, "grey")

# Map national provision by provider type, restricted to the combined layer.
# NOTE(review): org_type_colormap above is defined but never used, and its
# colours differ from the cmap list here ("#28A197"/"#12436D" vs
# "#1d70b8"/"#28A197") — confirm which palette is intended.
lpa_ca_join_count_gdf[lpa_ca_join_count_gdf["old_lpa_combo_display"] == True].explore(
    tiles = "CartoDB positron",
    column = "provider_org_type", 
    tooltip = ["reference", "lpa_name", "platform_data_providers"],
    cmap = ["#1d70b8", "#28A197", "white"],
    style_kwds=dict(color="black", weight = 1))
Out[21]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [22]:
# gdf for matches: displayed LPAs where the platform and manual counts agree exactly
count_comp_equal_gdf = lpa_ca_join_count_gdf[
    (lpa_ca_join_count_gdf["old_lpa_combo_display"] == True) &
    (lpa_ca_join_count_gdf["count_platform"] == lpa_ca_join_count_gdf["count_manual"])].copy()

count_comp_equal_gdf["count_comparison"] = "match"

# gdf for mis-matches: displayed LPAs with platform data whose counts differ
# (a NaN manual count compares as not-equal, so those rows land here too)
count_comp_gdf = lpa_ca_join_count_gdf[
    (lpa_ca_join_count_gdf["old_lpa_combo_display"] == True) &
    (lpa_ca_join_count_gdf["count_platform"] > 0) &
    (lpa_ca_join_count_gdf["count_platform"] != lpa_ca_join_count_gdf["count_manual"])].copy()

Conservation area count comparison: matches¶

In [23]:
# Map the LPAs where the platform count matches the manual count exactly

count_comp_equal_gdf.explore(
    tiles = "CartoDB positron",
    color = "green", 
    tooltip = ["reference", "lpa_name", "platform_data_providers", "count_platform", "count_manual", "count_comparison"],
    style_kwds=dict(color="black", weight = 1, fillOpacity = 0.3)
)
Out[23]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Conservation area count comparison: mis-matches¶

In [24]:
# Map the LPAs where the platform count and the manual count disagree,
# coloured by the binned relative delta (kwargs re-indented consistently).

count_comp_gdf.explore(
    tiles = "CartoDB positron",
    column = "count_delta_bins",
    tooltip = ["reference", "lpa_name", "platform_data_providers", "count_platform", "count_manual", "count_delta", "count_delta_bins"],
    cmap = "coolwarm",
    style_kwds=dict(color="black", weight = 1),
)
Out[24]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]: